print(2+3)
print('Hello World')
print('Learning Pythom with Ammar')
5 Hello World Learning Pythom with Ammar
print(5+5)
print(9-5)
print(8/4) # gives you float type result
print(8//4) # gives you a whole number
print(4*7)
print(9%2)
print(8**2) # raise a number by given , it is a complex operation and there are methods to do it.
print(9+2**4/6*4-5)
10 4 2.0 2 28 1 64 14.666666666666664
PEMDAS Parenthesis Exponents Multiply Divide Addition Substraction
Left to Right sequence for M D A S
print('Hello World')
print('Learning Pythom with Ammar')
print('single quote')
print("double quote")
print('''tripple quote''')
# print('What's quote') #gives error
print("What's quote")
print('''What's quote''')
Hello World Learning Pythom with Ammar single quote double quote tripple quote What's quote What's quote
Press Ctrl+K+C to comment and Ctrl+K+U to uncomment, since in my PC Ctrl+/ doesn't work might be because of spanish keyboard, operating system keyboard shortcut is overriding the vscode shortcut
Resolved: Ctrl+Numpad/ it was due to the spanish keyboard layout
Variables: objects containing specific values
x= 5 #numeric or integer variable
print(x)
y="learning python with Ammar" #string variables
print(y)
x=x+3
print(x)
# class/types of variables
print(type(x), type(y))
fruit_baskets="Mangoes"
# del fruit_baskets # delete variable
print(fruit_baskets,type(fruit_baskets))
5 learning python with Ammar 8 <class 'int'> <class 'str'> Mangoes <class 'str'>
Rules to assign a variable
fruit_basket="Mangoes"
print(fruit_basket)
# input function
# Ask the user to enter values
fruit_basket=input("What is your favourite fruit? ")
print(fruit_basket)
# input function of 2nd stage
name= input("What is your name?")
greetings= "Hello!"
print(greetings, name)
# Another way
name= input("What is your name?")
print("Hello!", name)
#input function of 3rd stage
name= input("What is your name?")
age= input("What is your age?")
height=input("What is your height?")
greetings= "Hello!"
print(greetings, name, "You are still young!") # just by mentioning one input function all rest of inpput functions run automatically
Mangoes What is your favourite fruit? apple apple What is your name?kom Hello! kom What is your name?kom Hello! kom What is your name?kom What is your age?9 What is your height?9 Hello! kom You are still young!
equal to ==
not equal to !=
greater than >
less than <
less than and equal to <=
greater than and equal to >=
print(4==4)
print(4<9)
print(4!=5)
# applications of logical operators
samad_age=8
age_at_school=5
print(samad_age==age_at_school)
# input function and logical ooerators
age_at_school=5
samad_age=input("How old is samad?")
print(samad_age)
print(type(samad_age))
samad_age=int(samad_age)
print(type(samad_age))
print(samad_age==age_at_school)
True True True False How old is samad?9 9 <class 'str'> <class 'int'> False
x= 1 #integer
y= 34.2 #float
z="Hello!" #string
x=x*y # results in float if one is float
print(type(x))
#implicit type conversion
x=x+y # results in float if one is float
print(x,"Type of x is: ",type(x))
#explicit type conversion
age=input("What is your age? ") # results in float if one is float
print(age,type(int(age)))
print(("18.5")) # can't change a decimal number in a string into float
print(int(18.5)) #BUT can change a decimal number into integer
print(str(18.5)) #change any number to string
<class 'float'> 68.4 Type of x is: <class 'float'> What is your age? 9 9 <class 'int'> 18.5 18 18.5
required_age_at_school= 5
samad_age= 4
# question: can samad go to school
if samad_age==required_age_at_school:
print("samad can join the school")
elif samad_age>=required_age_at_school:
print("samad should join higher seondary school")
elif samad_age<=2:
print("you should take care of samad, he is still a baby!")
else:
print("samad can not go to school")
samad can not go to school
print("Learning with Aammar")
print("Learning with Aammar")
print("Learning with Aammar")
print("Learning with Aammar")
print("Learning with Aammar")
print("Learning with Aammar")
#defining functions
#1
def print_codanics():
print("Learning with Aammar")
print("Learning with Aammar")
print("Learning with Aammar")
print_codanics()
#2
def print_codanics():
text = "Learning with Aammar"
print(text)
print(text)
print(text)
print_codanics()
#3
def print_codanics(text):
print(text)
print(text)
print(text)
print_codanics("Learning with Aammar")
# defining a function with if, elif and else statements
def school_calculator(age, text):
if age==5:
print("samad can join the school")
elif age > 5:
print("samad should join higher seondary school")
else:
print("you should take care of samad, he is still a baby!")
school_calculator(2,"SAMAD")
# defining a function of future
def future_age(age):
new_age=age+20
return new_age
print(new_age)
future_predicted_age= future_age(18)
print(future_predicted_age)
Learning with Aammar Learning with Aammar Learning with Aammar Learning with Aammar Learning with Aammar Learning with Aammar Learning with Aammar Learning with Aammar Learning with Aammar Learning with Aammar Learning with Aammar Learning with Aammar Learning with Aammar Learning with Aammar Learning with Aammar you should take care of samad, he is still a baby! 38
While and for Loops while loop
x=0
while(x<=5):
print(x)
x=x+1
0 1 2 3 4 5
for loop
for x in range(4,8):
print(x)
# array
days = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat","Sun"]
for d in days:
if(d=="Fri"):break #loop stops
if(d=="Fri"):continue # skip d
print(d)
4 5 6 7 Mon Tue Wed Thu
to use already built-in functions from open source libraries
import math
print("The value of pi is ", math.pi)
import statistics
x=[150,250,350,450]
print(statistics.mean(x))
The value of pi is 3.141592653589793 300
numpy, pandas, ... for statistical analysis , data visualization
# print(learning python with Aammar) # syntax error i.e. python language is not used correctly
# print(25/0) # run time error is mathematical mistake
name= "suboor"
print("Hello name") #semantic error human error in writing, dfficult to troubleshoot
print("Hello "+ name) # you don't need , if you use +
Hello name Hello suboor
name= input("What is your name? ")
weight= input("What is your weight? ")
height= input("What is your height? ")
BMI=float(weight)/(float(height)**2) #input returns a string; ** requires numbers.
print("My Name is", name ,"and my BMI is", BMI)
What is your name? kom What is your weight? 65 What is your height? 163 My Name is kom and my BMI is 0.002446460160337235
Steps involved in visualization BARPLOTS
# Step-1 import libraries
import seaborn as sns
import matplotlib.pyplot as plt
# Step-2 set a theme
sns.set_theme(style="ticks", color_codes=True)
# Step-3 import dataset (you can also import your own dataset)
titanic = sns.load_dataset("titanic")
# sns.get_datasets_names() # this command is to know which datasets are available in seaborn
# Step-4 plot basic graph
sns.catplot(x='sex', y='survived', hue='class',kind='bar', data=titanic)
plt.show()
COUNTPLOTS
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_theme(style="ticks", color_codes=True)
titanic = sns.load_dataset("titanic")
p1=sns.countplot(x='sex', data=titanic, hue='class')
p1.set_title("plot for counting")
plt.show()
SCATTERPLOTS
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_theme(style="ticks", color_codes=True)
titanic = sns.load_dataset("titanic")
#print(titanic) # to know which catogeries are there in data which you may plot
g=sns.FacetGrid(titanic, row='sex', hue='alone')
g=(g.map(plt.scatter,'age','fare').add_legend())
plt.show()
# Steps involved in visualization
# Step-1 import libraries
import pandas as pd # to use your own data-set and to decribe the qualities of the data
import seaborn as sns
import matplotlib.pyplot as plt
# Step-2 set a theme
sns.set_theme(style="ticks", color_codes=True)
# Step-3 import dataset (you can also import your own dataset)
dataviz=pd.read_csv('data_viz.csv')
print(dataviz)
# Step-4 plot basic graph
# p1=sns.countplot(x='Gender', data=dataviz, hue='Age')
p1=sns.countplot(x='Gender', data=dataviz, hue='Time of class (pm)')
p1.set_title("Voting for Time of Class")
plt.show()
Timestamp Gender Age Location Time of class (pm) \
0 1/3/2022 19:09:29 Male 16-30 Pakistan 10:30
1 1/3/2022 19:09:33 Male 16-30 Pakistan 10:00
2 1/3/2022 19:09:33 Male 16-30 Pakistan 10:00
3 1/3/2022 19:09:33 Male 30-40 Pakistan 09:30
4 1/3/2022 19:09:34 Male 16-30 East 09:30
.. ... ... ... ... ...
301 1/3/2022 19:11:51 Male 16-30 Pakistan 09:30
302 1/3/2022 19:11:52 Male 16-30 Pakistan 10:30
303 1/3/2022 19:11:53 Male 16-30 Pakistan 10:00
304 1/3/2022 19:11:54 Female 16-30 Pakistan 10:30
305 1/3/2022 19:11:55 Male 16-30 Pakistan 10:30
Duration (min)
0 60
1 60
2 30
3 30
4 60
.. ...
301 30
302 45
303 60
304 60
305 45
[306 rows x 6 columns]
#make a string
a="samosa pakora"
a
'samosa pakora'
#length of indices
len(a)
13
a[0]
's'
a[1]
'a'
a[2]
'm'
# last index is an exclusive element
a[0:6]
'samosa'
# reverse indexing
a[-2]
'r'
a[-1:6] # not ethical in programming terms
''
a[-6:13] #to get the string till the end, using length of string
'pakora'
a[-6:-1]
'pakor'
food='biryani'
len(food)
7
# capitalise every element
food.capitalize()
'Biryani'
#lowercase letters
food.lower()
'biryani'
# replace
food.replace('b','sh')
'shiryani'
#counting a specific letter in a string
name="Komal Khan"
name.count('a') #case sensative
2
# finding an index number in string
name.find('n')
9
# how to split a string
food= "I love biryani,haleem, chinese rice and nihari"
food
'I love biryani,haleem, chinese rice and nihari'
food.split(",")
['I love biryani', 'haleem', ' chinese rice and nihari']
tup1= (3, 'python', True, 3.16)
tup1
(3, 'python', True, 3.16)
tup2= (2, 'data', 9.45, False)
tup2
(2, 'data', 9.45, False)
#addition of tuples
tup3= tup1+tup2
tup3
(3, 'python', True, 3.16, 2, 'data', 9.45, False)
#multiplication of tuples
tup3= tup1*2+tup2
tup3
(3, 'python', True, 3.16, 3, 'python', True, 3.16, 2, 'data', 9.45, False)
tup3=(20,30,5,89,63,55,4)
tup3
(20, 30, 5, 89, 63, 55, 4)
tup3*2
(20, 30, 5, 89, 63, 55, 4, 20, 30, 5, 89, 63, 55, 4)
min(tup3)
4
max(tup3)
89
Python has two built-in methods that you can use on tuples.
tup1.count('python')
1
tup1.index(3.16)
3
list1=[8,"structures", True]
list1
[8, 'structures', True]
type(list1)
list
len(list1)
3
list1[1]
'structures'
list2=[2,4,5,"learning","python",6.9,True]
list2
[2, 4, 5, 'learning', 'python', 6.9, True]
# Reverses the order of the list
list1.reverse()
# Adds an element at the end of the list
list1.append("with Aammar")
# list1.count() #syntax of the count() method is list1.count(element) where element is specified to be counted as appearing in the list,
list1.count("structures") # 1 and true are counted as same
1
list3=[20,30,5,89,63,55,4]
list3
[20, 30, 5, 89, 63, 55, 4]
len(list3)
7
#sorting a list
list3.sort()
list3
[4, 5, 20, 30, 55, 63, 89]
list3*2
[4, 5, 20, 30, 55, 63, 89, 4, 5, 20, 30, 55, 63, 89]
list1+list2
[True, 'structures', 8, 'with Aammar', 2, 4, 5, 'learning', 'python', 6.9, True]
# Removes all the elements from the list
list3.clear()
list3
[]
# Returns a copy of the list
list3 =list1.copy()
list3
[True, 'structures', 8, 'with Aammar']
# Add the elements of a list (or any iterable), to the end of the current list
list3.extend(['BMW'])
list3
[True, 'structures', 8, 'with Aammar', 'BMW']
# Removes the first item with the specified value
list3.remove('BMW')
list3
[True, 'structures', 8, 'with Aammar']
# Adds an element at the specified position
list3.insert(2,'Volvo')
list3
[True, 'structures', 'Volvo', 8, 'with Aammar']
# Removes the element at the specified position
list3.pop(2)
list3
[True, 'structures', 8, 'with Aammar']
# fruits and their prices
fruit1={"Apple":12, "Banana":20, "Pineapple":40, "Tomatoes":23}
fruit1
{'Apple': 12, 'Banana': 20, 'Pineapple': 40, 'Tomatoes': 23}
type(fruit1)
dict
#extract data. Returns a list containing the dictionary's keys
keys1=fruit1.keys()
keys1
dict_keys(['Apple', 'Banana', 'Pineapple', 'Tomatoes'])
# Returns a list of all the values in the dictionary
values1=fruit1.values()
values1
dict_values([12, 20, 40, 23])
# adding new element
fruit1["Mangoes"]=10
fruit1
{'Apple': 12, 'Banana': 20, 'Pineapple': 40, 'Tomatoes': 23, 'Mangoes': 10}
#update new element
fruit1["Mangoes"]=45
fruit1
{'Apple': 12, 'Banana': 20, 'Pineapple': 40, 'Tomatoes': 23, 'Mangoes': 45}
fruit2={"Melon":55, "Apricot":45, "Guava":40, "Strawberry":70}
fruit2
{'Melon': 55, 'Apricot': 45, 'Guava': 40, 'Strawberry': 70}
#Concatinate
#fruit1+ fruit2 # can't be done in dictonaries because its unordered collection. Updates the dictionary with the specified key-value pairs
fruit1.update(fruit2)
fruit1
{'Apple': 12,
'Banana': 20,
'Pineapple': 40,
'Tomatoes': 23,
'Mangoes': 45,
'Melon': 55,
'Apricot': 45,
'Guava': 40,
'Strawberry': 70}
# Returns a copy of the dictionary
fruit3=fruit1.copy()
fruit3
{'Apple': 12,
'Banana': 20,
'Pineapple': 40,
'Tomatoes': 23,
'Mangoes': 45,
'Melon': 55,
'Apricot': 45,
'Guava': 40,
'Strawberry': 70}
# Removes all the elements from the dictionary
fruit3.clear()
fruit3
{}
# Returns a dictionary with the specified keys and value
x = ('key1', 'key2', 'key3')
y = 0
new_dict=dict.fromkeys(x,y)
new_dict
{'key1': 0, 'key2': 0, 'key3': 0}
# Returns the value of the specified key
fruit=fruit1.get('Banana')
fruit
20
# Returns a list containing a tuple for each key value pair
fruit=fruit1.items()
fruit
dict_items([('Apple', 12), ('Banana', 20), ('Pineapple', 40), ('Tomatoes', 23), ('Mangoes', 45), ('Melon', 55), ('Apricot', 45), ('Guava', 40), ('Strawberry', 70)])
# Removes the element with the specified key
fruit1.pop("Banana")
fruit1
{'Apple': 12,
'Pineapple': 40,
'Tomatoes': 23,
'Mangoes': 45,
'Melon': 55,
'Apricot': 45,
'Guava': 40,
'Strawberry': 70}
# Removes the last inserted key-value pair
fruit1.popitem()
fruit1
{'Apple': 12,
'Pineapple': 40,
'Tomatoes': 23,
'Mangoes': 45,
'Melon': 55,
'Apricot': 45,
'Guava': 40}
# Returns the value of the specified key. If the key does not exist: insert the key, with the specified value
fruit=fruit1.setdefault('Melon',"20")
fruit
55
s1={1,2,"Codanics","python",True}
s1
{1, 2, 'Codanics', 'python'}
# Adds an element to the set
s1.add("Aammar1")
s1
{1, 2, 'Aammar1', 'Codanics', 'python'}
# Removes the specified element
s1.remove("Aammar1")
s1
{1, 2, 'Codanics', 'python'}
# Returns a copy of the set
fruits = {"apple", "banana", "cherry"}
x = fruits.copy()
x
{'apple', 'banana', 'cherry'}
# Removes all the elements from the set
fruits.clear()
fruits
set()
# Returns a set containing the difference between two or more sets
x = {"apple", "banana", "cherry"}
y = {"google", "microsoft", "apple"}
z = x.difference(y)
z
{'banana', 'cherry'}
# Returns a set, that is the intersection of two or more sets
z = x.intersection(y)
z
{'apple'}
# Removes the items in this set that are also included in another, specified set
x.difference_update(y)
x
{'banana', 'cherry'}
# Remove the specified item
y.discard("google")
y
{'apple', 'microsoft'}
# Removes the items in this set that are not present in other, specified set(s)
x = {"apple", "banana", "cherry"}
y = {"google", "microsoft", "apple"}
x.intersection_update(y)
x
{'apple'}
# Returns whether two sets have a intersection or not
x = {"apple", "banana", "cherry"}
y = {"google", "microsoft", "facebook"}
z = x.isdisjoint(y)
z
True
# Returns whether another set contains this set or not
x = {"a", "b", "c"}
y = {"f", "e", "d", "c", "b", "a"}
z = x.issubset(y)
z
True
# Returns whether this set contains another set or not
z = x.issuperset(y)
z
False
# Removes an element from the set
y.pop()
y
{'a', 'c', 'd', 'e', 'f'}
# Returns a set with the symmetric differences of two sets
z = x.symmetric_difference(y)
z
{'b', 'd', 'e', 'f'}
# inserts the symmetric differences from this set and another
x.symmetric_difference_update(y)
x
{'b', 'd', 'e', 'f'}
# Return a set containing the union of sets, duplicates are excluded
x = {"apple", "banana", "cherry"}
y = {"google", "microsoft", "apple"}
z = x.union(y)
z
{'apple', 'banana', 'cherry', 'google', 'microsoft'}
# update() Update the set with another set, or any other iterable
x.update(y)
x
{'apple', 'banana', 'cherry', 'google', 'microsoft'}
#import libraries
import seaborn as sns
import matplotlib.pyplot as plt
#load dataset
phool= sns.load_dataset("iris")
phool
#draw a line plot
sns.lineplot(x="sepal_length",y="sepal_width",data=phool)
plt.show()
#import libraries
import seaborn as sns
import matplotlib.pyplot as plt
#load dataset
phool= sns.load_dataset("iris")
phool
#draw a line plot
sns.lineplot(x="sepal_length",y="sepal_width",data=phool)
plt.title("Phoolo ka Plot")
plt.show()
#import libraries
import seaborn as sns
import matplotlib.pyplot as plt
#load dataset
phool= sns.load_dataset("iris")
phool
#draw a line plot
sns.lineplot(x="sepal_length",y="sepal_width",data=phool)
plt.title("Phoolo ka Plot")
plt.xlim(4)
plt.xlim(3)
plt.show()
#import libraries
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_style(style=None, rc=None)
#load dataset
phool= sns.load_dataset("iris")
phool
#draw a line plot
sns.lineplot(x="sepal_length",y="sepal_width",data=phool)
plt.title("Phoolo ka Plot")
sns.set_style("dark")
plt.show()
#import libraries
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_style(style=None, rc=None)
#load dataset
phool= sns.load_dataset("iris")
phool
# change size of figure
plt.figure(figsize=(6,6))
#draw a line plot
sns.lineplot(x="sepal_length",y="sepal_width",data=phool)
plt.title("Phoolo ka Plot")
sns.set_style("dark")
plt.show()
#import libraries
import seaborn as sns
import matplotlib.pyplot as plt
#load dataset
phool= sns.load_dataset("iris")
phool
| sepal_length | sepal_width | petal_length | petal_width | species | |
|---|---|---|---|---|---|
| 0 | 5.1 | 3.5 | 1.4 | 0.2 | setosa |
| 1 | 4.9 | 3.0 | 1.4 | 0.2 | setosa |
| 2 | 4.7 | 3.2 | 1.3 | 0.2 | setosa |
| 3 | 4.6 | 3.1 | 1.5 | 0.2 | setosa |
| 4 | 5.0 | 3.6 | 1.4 | 0.2 | setosa |
| ... | ... | ... | ... | ... | ... |
| 145 | 6.7 | 3.0 | 5.2 | 2.3 | virginica |
| 146 | 6.3 | 2.5 | 5.0 | 1.9 | virginica |
| 147 | 6.5 | 3.0 | 5.2 | 2.0 | virginica |
| 148 | 6.2 | 3.4 | 5.4 | 2.3 | virginica |
| 149 | 5.9 | 3.0 | 5.1 | 1.8 | virginica |
150 rows � 5 columns
#draw a bar plot
sns.barplot(x="species",y="sepal_width", data=phool)
plt.show()
#import libraries
import seaborn as sns
import matplotlib.pyplot as plt
#load dataset
kashti= sns.load_dataset("titanic")
kashti
| survived | pclass | sex | age | sibsp | parch | fare | embarked | class | who | adult_male | deck | embark_town | alive | alone | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 3 | male | 22.0 | 1 | 0 | 7.2500 | S | Third | man | True | NaN | Southampton | no | False |
| 1 | 1 | 1 | female | 38.0 | 1 | 0 | 71.2833 | C | First | woman | False | C | Cherbourg | yes | False |
| 2 | 1 | 3 | female | 26.0 | 0 | 0 | 7.9250 | S | Third | woman | False | NaN | Southampton | yes | True |
| 3 | 1 | 1 | female | 35.0 | 1 | 0 | 53.1000 | S | First | woman | False | C | Southampton | yes | False |
| 4 | 0 | 3 | male | 35.0 | 0 | 0 | 8.0500 | S | Third | man | True | NaN | Southampton | no | True |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 886 | 0 | 2 | male | 27.0 | 0 | 0 | 13.0000 | S | Second | man | True | NaN | Southampton | no | True |
| 887 | 1 | 1 | female | 19.0 | 0 | 0 | 30.0000 | S | First | woman | False | B | Southampton | yes | True |
| 888 | 0 | 3 | female | NaN | 1 | 2 | 23.4500 | S | Third | woman | False | NaN | Southampton | no | False |
| 889 | 1 | 1 | male | 26.0 | 0 | 0 | 30.0000 | C | First | man | True | C | Cherbourg | yes | True |
| 890 | 0 | 3 | male | 32.0 | 0 | 0 | 7.7500 | Q | Third | man | True | NaN | Queenstown | no | True |
891 rows � 15 columns
# order the bars in barplot
sns.barplot(x="sex", y="survived", hue="alone", data=kashti, order=["female","male"])
plt.show()
# adding colors to the bars in barplot
sns.barplot(x="sex", y="survived", hue="alone", data=kashti, order=["female","male"], color="salmon")
plt.show()
# removing line form the bars in barplot
sns.barplot(x="sex", y="survived", hue="alone", data=kashti, order=["female","male"], color="salmon", ci=None)# confidence interval will be learned in statistical class
plt.show()
# Palettes in barplot
sns.barplot(x="sex", y="survived", hue="alone", data=kashti, order=["female","male"], color="red", ci=None,
palette="bright")# confidence interval will be learned in statistical class
plt.show()
# estimator can be also applied, but we need numpy library to use such methods
#import libraries
import seaborn as sns
import matplotlib.pyplot as plt
#load dataset
kashti= sns.load_dataset("titanic")
kashti
from numpy import mean, median
sns.barplot(x="sex", y="survived", hue="alone", data=kashti, color="red", estimator=mean)
plt.show()
# changing saturation
sns.barplot(x="sex", y="survived", hue="alone", data=kashti, color="red", estimator=mean, saturation=.5)
plt.show()
# Horizontal plot
import seaborn as sns
import matplotlib.pyplot as plt
#load dataset
kashti= sns.load_dataset("titanic")
kashti
from numpy import mean, median
sns.barplot(x="fare", y="class", hue="alone", data=kashti, color="red", estimator=mean)
plt.show()
import seaborn as sns
import matplotlib.pyplot as plt
#load dataset
kashti= sns.load_dataset("titanic")
kashti
from numpy import mean, median
sns.barplot(x="sex", y="fare", data=kashti,
linewidth=2.5, facecolor=(1,1,1,0),
errcolor=".2",edgecolor=".2") # These 4 arguments of facecolor represents - RGBA (red, green, blue, alpha) where RGB are color intensities that can be defined as a value within range [0, 1] and A controls opacity of the facecolor also a value within range [0, 1]
<AxesSubplot:xlabel='sex', ylabel='fare'>
import seaborn
seaborn.set_style("whitegrid") # theme of canvas
kashti=seaborn.load_dataset("titanic")
kashti
seaborn.boxplot(x="class", y="fare", data=kashti)
<AxesSubplot:xlabel='class', ylabel='fare'>
import seaborn
seaborn.set_style("whitegrid") # theme of canvas
tip=seaborn.load_dataset("tips")
tip
| total_bill | tip | sex | smoker | day | time | size | |
|---|---|---|---|---|---|---|---|
| 0 | 16.99 | 1.01 | Female | No | Sun | Dinner | 2 |
| 1 | 10.34 | 1.66 | Male | No | Sun | Dinner | 3 |
| 2 | 21.01 | 3.50 | Male | No | Sun | Dinner | 3 |
| 3 | 23.68 | 3.31 | Male | No | Sun | Dinner | 2 |
| 4 | 24.59 | 3.61 | Female | No | Sun | Dinner | 4 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 239 | 29.03 | 5.92 | Male | No | Sat | Dinner | 3 |
| 240 | 27.18 | 2.00 | Female | Yes | Sat | Dinner | 2 |
| 241 | 22.67 | 2.00 | Male | Yes | Sat | Dinner | 2 |
| 242 | 17.82 | 1.75 | Male | No | Sat | Dinner | 2 |
| 243 | 18.78 | 3.00 | Female | No | Thur | Dinner | 2 |
244 rows � 7 columns
seaborn.boxplot(x="day", y="tip", data=tip, saturation=0.1)
<AxesSubplot:xlabel='day', ylabel='tip'>
# estimator
import seaborn as sns
seaborn.set_style("whitegrid") # theme of canvas
tip=seaborn.load_dataset("tips")
tip
seaborn.boxplot(x="sex", y="tip", data=tip, saturation=0.5) # estimator=mean can't be used here because of the type of the plot
<AxesSubplot:xlabel='sex', ylabel='tip'>
# describing summary of numeric data
import seaborn as sns
seaborn.set_style("whitegrid") # theme of canvas
tip=sns.load_dataset("tips")
tip.describe() # numeric data is mostly drawn on y-axis while categorical variables on x-axis and hue,
# never put numeric data in hue. More details and reasons will be explained later
sns.boxplot(x="total_bill", data=tip) # Indexing is used to access the elements of column "total_bill" in dataframe "tip"
<AxesSubplot:xlabel='total_bill'>
sns.boxplot(y="total_bill", data=tip)
<AxesSubplot:ylabel='total_bill'>
#tip[['total_bill', 'tip']].plot(kind='box', title='boxplot')
<AxesSubplot:title={'center':'boxplot'}>
import seaborn as sns
import pandas as pd
sns.set_style("whitegrid") # theme of canvas
tip=sns.load_dataset("tips")
#sns.boxplot(x=tip["total_bill","tip"], data=tip) # doesn't work
# Trying this
tip_long=pd.melt(tip) # to "melt" the sample dataframe into its "long-form" where each column is a variable and
# each row is an observation
tip_long
# sns.boxplot(x="total_bill", y="tip", data=tip_long) # doesn't work either
import seaborn as sns
sns.set_style("whitegrid") # theme of canvas
tip=sns.load_dataset("tips")
tip
| total_bill | tip | sex | smoker | day | time | size | |
|---|---|---|---|---|---|---|---|
| 0 | 16.99 | 1.01 | Female | No | Sun | Dinner | 2 |
| 1 | 10.34 | 1.66 | Male | No | Sun | Dinner | 3 |
| 2 | 21.01 | 3.50 | Male | No | Sun | Dinner | 3 |
| 3 | 23.68 | 3.31 | Male | No | Sun | Dinner | 2 |
| 4 | 24.59 | 3.61 | Female | No | Sun | Dinner | 4 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 239 | 29.03 | 5.92 | Male | No | Sat | Dinner | 3 |
| 240 | 27.18 | 2.00 | Female | Yes | Sat | Dinner | 2 |
| 241 | 22.67 | 2.00 | Male | Yes | Sat | Dinner | 2 |
| 242 | 17.82 | 1.75 | Male | No | Sat | Dinner | 2 |
| 243 | 18.78 | 3.00 | Female | No | Thur | Dinner | 2 |
244 rows � 7 columns
# checking hue, orient, dodge features of boxplot
sns.boxplot(x="tip", y="day", hue="smoker",data=tip, palette="Set2", dodge=True ) # keeping arguments in order is very important
<AxesSubplot:xlabel='tip', ylabel='day'>
# Changing Color of own choice: to do this search in Google color picker and select your own color and copy HEX code
sns.boxplot(x="tip", y="day", data=tip, color="#a742f5")
<AxesSubplot:xlabel='tip', ylabel='day'>
# Assignment: how to manage individual color for each hue color?
palette = ['tab:blue', 'tab:green']
sns.boxplot(x="tip", y="day", saturation= 1, data=tip, orient="h", hue="smoker",
palette=palette)
# The hue variable of seaborn.barplot() is mapped via palette:
# palette: palette name, list, or dict
# Colors to use for the different levels of the hue variable. Should be something
# that can be interpreted by seaborn.color_palette(), or a dictionary mapping hue
# levels to matplotlib colors.
# So to customize your hue colors,
# either define a color list:
# palette = ['tab:blue', 'tab:green']
# or a hue-color dictionary:
# palette = {
# 'Yes': 'tab:blue',
# 'No': 'tab:green',
# }
# And pass that to palette:
# sns.barplot(x="tip", y="day", saturation= 1, data=tip, orient="h", hue="smoker",
# palette=palette)
<AxesSubplot:xlabel='tip', ylabel='day'>
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
sns.set_style("whitegrid") # theme of canvas
kashti=sns.load_dataset("titanic")
kashti.head() # To get the headers of all data
| survived | pclass | sex | age | sibsp | parch | fare | embarked | class | who | adult_male | deck | embark_town | alive | alone | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 3 | male | 22.0 | 1 | 0 | 7.2500 | S | Third | man | True | NaN | Southampton | no | False |
| 1 | 1 | 1 | female | 38.0 | 1 | 0 | 71.2833 | C | First | woman | False | C | Cherbourg | yes | False |
| 2 | 1 | 3 | female | 26.0 | 0 | 0 | 7.9250 | S | Third | woman | False | NaN | Southampton | yes | True |
| 3 | 1 | 1 | female | 35.0 | 1 | 0 | 53.1000 | S | First | woman | False | C | Southampton | yes | False |
| 4 | 0 | 3 | male | 35.0 | 0 | 0 | 8.0500 | S | Third | man | True | NaN | Southampton | no | True |
# to show mean symbol
sns.boxplot(x="survived", y="age", data=kashti, showmeans=True )
<AxesSubplot:xlabel='survived', ylabel='age'>
# to show and control labels symbols, markers
sns.boxplot(x="survived", y="age", showmeans=True,
meanprops={"marker": "*",
"markersize": "12",
"markeredgecolor": "yellow"}, data=kashti)
plt.xlabel("How many survived?", size=10)
plt.ylabel("Age (years)",size=10)
plt.title("Boxplot for Titanic Survivors",size=10, weight="bold")
plt.show()
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Step-2 set a theme
sns.set_theme(style="ticks", color_codes=True)
chilla_data=pd.read_csv("Chilla_data2_for_plots.csv")
chilla_data.head() # To get the headers of all data
| Gender | Location | Age | Qualification_completed | field_of_study | Purpose_for_chilla | What are you? | Blood group | Which mobile sim do you use | Prepaid or Postpaid | ... | Your favorite programming language? | Marital Status? | Are you Vaccinated? | Where do you live? | Research-Working experience_years | Age_years | Weight_kg | Height_cm | Coding Hours | Load_Shedding | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Male | Pakistan | 36-40 | Masters | Natural Sciences | to boost my skill set | Unemplyed | B+ | U-fone | Prepaid | ... | Python | Yes | Yes | Urbun | 5.0 | 38.00 | 77.0 | 179.000 | 3.0 | 2 |
| 1 | Male | Pakistan | 26-30 | Bachelors | CS/IT | to boost my skill set | Student | B+ | U-fone | Prepaid | ... | Python | No | Yes | Urbun | 1.0 | 25.00 | 53.6 | 178.000 | 2.0 | 6 |
| 2 | Male | Pakistan | 31-35 | Masters | Enginnering | Switch my field of study | Employed | B+ | Zong | Prepaid | ... | Python | Yes | Yes | Urbun | 5.5 | 31.34 | 93.0 | 173.000 | 2.0 | 0 |
| 3 | Female | Pakistan | 31-35 | Masters | CS/IT | to boost my skill set | Employed | O+ | U-fone | Postpaid | ... | Python | Yes | Yes | Urbun | 5.0 | 33.00 | 60.0 | 157.000 | 3.0 | 24 |
| 4 | Female | Pakistan | 26-30 | Masters | Enginnering | to boost my skill set | Student | A- | Mobilink | Prepaid | ... | Javascript | No | Yes | Rural | 3.5 | 27.00 | 59.9 | 164.544 | 6.0 | 12 |
5 rows � 23 columns
sns.boxplot(x="Coding Hours", y="field_of_study",hue="Location",data=chilla_data,
palette="Set2", saturation= 1, dodge= True)
plt.title("Boxplot for Chilla Students",size=10, weight="bold")
plt.show()
# Facet Grid, not have them all in a column or a row, but formatted in a grid (e.g. 2x4)
# tips = sns.load_dataset('chilla_data')
# ordered_days = sorted(tips['day'].unique())
# g = sns.FacetGrid(tips,col='day',col_order=ordered_days,col_wrap=2)
# # change this to 4 ^
# g.map(sns.boxplot,'sex','total_bill',palette='muted')
# for ax in g.axes.flatten():
# ax.tick_params(labelbottom=True)
# plt.tight_layout()
# plt.show()
g = sns.FacetGrid(chilla_data,col='Gender',row="Location")
# change this to 4 ^
g.map(sns.boxplot,"Coding Hours", "field_of_study",palette='muted')
C:\Users\Komal Khan\anaconda3\lib\site-packages\seaborn\axisgrid.py:670: UserWarning: Using the boxplot function without specifying `order` is likely to produce an incorrect plot. warnings.warn(warning)
<seaborn.axisgrid.FacetGrid at 0x1dd7edb7430>
1a. Worldwide Decline of Female employment in agriculture, forestry and fishing Sector (STACKED LINE PLOT)
import numpy as np
import pandas as pd
import seaborn as sns
import plotly.express as px
sns.set_theme(style="whitegrid")
wew_data=pd.read_csv("wew_percent_data_1-13-2022.csv")
wew_data=wew_data.sort_values(by="Year")
# wew = wew_data.pivot("Year", "Area", "Value")
# sns.lineplot(data=wew_data, x="Year", y="Value", hue="Area")
fig=px.area(wew_data, x="Year", y="Value", color="Area",
line_group="Area",title="Worldwide Decline of Female employment in agriculture, forestry and fishing Sector",
labels={ "Value": "Female employment in agriculture, forestry and fishing (%)",
"Area": "Continent"
})
fig.update_layout(title={'font': {'size': 20}})
1b. Worldwide Decline of Female employment in agriculture, forestry and fishing Sector (LINE PLOT)
import numpy as np
import pandas as pd
import seaborn as sns
import plotly.express as px
sns.set_theme(style="whitegrid")
wew_data=pd.read_csv("wew_percent_data_1-13-2022.csv")
wew_data=wew_data.sort_values(by="Year")
fig=px.line(wew_data, x="Year", y="Value", color="Area",
labels={ "Value": "Female employment in agriculture, forestry and fishing (%)",
"Area": "Continent"
}, line_shape='spline')
fig.update_layout(title={'font': {'size': 20}})
fig.update_yaxes(range = [0,70])
fig.update_traces(line=dict(width=4))
fig.update_layout(title={'text': '<b> Worldwide Decline of Female employment in agriculture, forestry and fishing Sector </b>'})
2. Population Growth Trend in Under War Countries of World (ANIMATED SCATTERED)
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
UWC_data=pd.read_csv("FAOSTAT_data_1-12-2022.csv")
# fig=px.bar(UWC_data, x="Area", y="Value",color="Area Code (FAO)",
# animation_frame="Year", animation_group="Area",
# hover_name="Area", range_y=[0,30000])
fig=px.scatter(UWC_data, x="Area", y="Value", animation_frame="Year", animation_group="Area",
size="Value", color="Area Code (FAO)", hover_name="Area",
size_max=55, range_y=[0,60000],
labels={ "Value": "Population (1000 persons)",
"Area": "Continent"
})
fig.update_layout(title={'font': {'size': 20}})
fig.update_layout(title={'text': '<b> Population Growth Trend in Under War Countries of World </b>'})
fig.show()
# import numpy as np
# import pandas as pd
# import seaborn as sns
# import matplotlib.pyplot as plt
# import plotly.express as px
# sns.set_theme(style="whitegrid")
# wew_data=pd.read_csv("wew_percent_data_1-13-2022.csv")
# wew_data=wew_data.sort_values(by="Year")
# fig = px.scatter(wew_data, x="Year",y="Value",color="Area",
# hover_data=['Value'])
# fig.show()
# pip install numpy # already install in jupyter
import numpy as np
a= np.array([2,3,7,8,0])
a
array([2, 3, 7, 8, 0])
# ndarray is a class of Numpy library which represents matrices and vectors.
# vectors are sigle dimentional arrays while matrices refers to two dimensional arrays.
# Dimensions are called axis.
type(a)
numpy.ndarray
# Length of the array
len(a)
5
# indexing in the the array
a[1]
3
# indexing all in the the array
a[0:]
array([2, 3, 7, 8, 0])
#sorting array
np.sort(a)
array([0, 2, 3, 7, 8])
# dimension of the array
np.ndim(a)
1
# to find unique elements in the array
np.unique(a)
array([ 2, 3, 6, 7, 45])
np.ones(3)
array([1., 1., 1.])
# default data type is floating point (np.float64), you can explicitly specify
np.ones(3, dtype=np.int64)
array([1, 1, 1], dtype=int64)
# generate arrays of zeros
np.zeros(4)
array([0., 0., 0., 0.])
# generate arrays of random numbers
rng = np.random.default_rng()
rng.random(3)
array([0.22812557, 0.63642781, 0.29747132])
# array with a range of elements
np.arange(5)
array([0, 1, 2, 3, 4])
# defined range of elements where last element is exclusive. reason will be
np.arange(5,10)
array([5, 6, 7, 8, 9])
# values that are spaced linearly in a specified interval
np.linspace(1,6, num=3)
array([1. , 3.5, 6. ])
# function empty creates an array whose initial content is random and depends on the state of the memory
b=np.empty(3)
b
array([1. , 3.5, 6. ])
fruit=np.array(["mango","bananna","apple"])
fruit
array(['mango', 'bananna', 'apple'], dtype='<U7')
price=np.array([50,35,45])
price
array([50, 35, 45])
price.mean()
43.333333333333336
x=np.array([34,5,68,30,6])
y=np.array([5,7,8,9,34,5,4])
x,y
(array([34, 5, 68, 30, 6]), array([ 5, 7, 8, 9, 34, 5, 4]))
z=np.concatenate((x,y))
z
array([34, 5, 68, 30, 6, 5, 7, 8, 9, 34, 5, 4])
np.sort(z)
array([ 4, 5, 5, 5, 6, 7, 8, 9, 30, 34, 34, 68])
a= np.array([[2,3,7,8,0], [3,45,6,7,8,]])
a
array([[ 2, 3, 7, 8, 0],
[ 3, 45, 6, 7, 8]])
np.ones((3,4))
array([[1., 1., 1., 1.],
[1., 1., 1., 1.],
[1., 1., 1., 1.]])
# default data type is floating point (np.float64), you can explicitly specify
np.ones((3,4), dtype=np.int64)
array([[1, 1, 1, 1],
[1, 1, 1, 1],
[1, 1, 1, 1]], dtype=int64)
# Creat Array of zeros
np.zeros((3,4))
array([[0., 0., 0., 0.],
[0., 0., 0., 0.],
[0., 0., 0., 0.]])
# Creat empty 2 -D Array
b=np.empty((3,4))
b
array([[0., 0., 0., 0.],
[0., 0., 0., 0.],
[0., 0., 0., 0.]])
import numpy as np
a= np.array([5,5,])
a
array([5, 5])
type(a)
numpy.ndarray
#list of lists
b= np.array([[5,5,5],[5,5,5],[5,5,5]])
b
array([[5, 5, 5],
[5, 5, 5],
[5, 5, 5]])
# concatenate 2-D arrays
x = np.array([[1, 2], [3, 4]])
y = np.array([[5, 6]])
x,y
z=np.concatenate((x,y))
z
array([[1, 2],
[3, 4],
[5, 6]])
a= np.array([[2,3,7], [3,45,6,]])
b= np.array([[5,6,7,], [3,45,8,]])
a,b
(array([[ 2, 3, 7],
[ 3, 45, 6]]),
array([[ 5, 6, 7],
[ 3, 45, 8]]))
# Concatenate 2 arrays along axis 0 , size must be matching of both array
c=np.concatenate((a,b), axis=0)
c
array([[ 2, 3, 7],
[ 3, 45, 6],
[ 5, 6, 7],
[ 3, 45, 8]])
# Concatenate along axis 0
c=np.concatenate((a,b), axis=1)
c
array([[ 2, 3, 7, 5, 6, 7],
[ 3, 45, 6, 3, 45, 8]])
# Vertical and horizontal stacking (all the input arrays must have same number of dimensions)
np.vstack((a, b)), np.hstack((a, b))
(array([[ 2, 3, 7],
[ 3, 45, 6],
[ 5, 6, 7],
[ 3, 45, 8]]),
array([[ 2, 3, 7, 5, 6, 7],
[ 3, 45, 6, 3, 45, 8]]))
a=np.array([[[ 2, 3, 7],
[ 3, 45, 8]],
[[ 3, 45, 6],
[5, 6, 7]],
[[5, 5, 5],
[5, 5, 5]]])
a,a.ndim
array([[[ 2, 3, 7],
[ 3, 45, 8]],
[[ 3, 45, 6],
[ 5, 6, 7]],
[[ 5, 5, 5],
[ 5, 5, 5]]])
# to transpose arrays rows to column
a.T
array([[[ 2, 3, 5],
[ 3, 5, 5]],
[[ 3, 45, 5],
[45, 6, 5]],
[[ 7, 6, 5],
[ 8, 7, 5]]])
b=np.array([[[ 7, 3, 89],
[ 34, 4, 18]],
[[ 23, 5, 0],
[ 5, 6, 7]],
[[ 4, 4, 5],
[ 5, 5, 6]]])
b
array([[[ 7, 3, 89],
[34, 4, 18]],
[[23, 5, 0],
[ 5, 6, 7]],
[[ 4, 4, 5],
[ 5, 5, 6]]])
#making and reshaping array
np.arange(24).reshape(2,3,4) # size defined in reshape should be equal to size of array
array([[[ 0, 1, 2, 3],
[ 4, 5, 6, 7],
[ 8, 9, 10, 11]],
[[12, 13, 14, 15],
[16, 17, 18, 19],
[20, 21, 22, 23]]])
a.size, a.shape
(6, (2, 3))
a=np.arange(9)
a
array([0, 1, 2, 3, 4, 5, 6, 7, 8])
# a is the array to be reshaped with the newshape which is the new shape you want. The shape should be compatible with the original shape.
np.reshape(a, newshape=(3,3) , order="C")
array([[0, 1, 2],
[3, 4, 5],
[6, 7, 8]])
# Using np.newaxis will increase the dimensions of your array by one dimension when used once.
a=np.arange(9)
a,a.shape,a.ndim
(array([0, 1, 2, 3, 4, 5, 6, 7, 8]), (9,), 1)
# 1-D to 2-D array conversion
a1 = a[np.newaxis, :]
a1,a1.shape,a1.ndim
(1, 9)
# 2-D to 3-D array conversion
a2 = a1[np.newaxis, :]
a2, a2.shape,a2.ndim
(array([[[0, 1, 2, 3, 4, 5, 6, 7, 8]]]), (1, 1, 9), 3)
# conversion of array to 1 D
x = np.array([[1 , 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]])
x,x.flatten(), x.ravel()
# difference of ravel() is that any changes after 1D conversion effects the original/parent array
(array([[ 1, 2, 3, 4],
[ 5, 6, 7, 8],
[ 9, 10, 11, 12]]),
array([ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]),
array([ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]))
# Indexing and slicing with arrays
a[3:6], a[-2:] # forward and reverse slicing
(array([3, 4, 5]), array([7, 8]))
#conditional indexing
a[a[(a > 2) & (a < 11)]]
array([3, 4, 5, 6, 7, 8])
# math functions with arrays
a.sum(), a.mean(), a.min(),a.max()
(36, 4.0, 0, 8)
# convert a 1D array with either a row vector or a column vector
row_vector = a[np.newaxis, :]
col_vector = a[:, np.newaxis]
row_vector, col_vector
# inserting a new axis
np.expand_dims(a, axis=1), np.expand_dims(a, axis=0)
(array([[0],
[1],
[2],
[3],
[4],
[5],
[6],
[7],
[8]]),
array([[0, 1, 2, 3, 4, 5, 6, 7, 8]]))
How to give headings in markdown?
Normal Text in Markdown.
Special Text
Second line
This is to check breaks in line
Second line goes
or
This is to check breaks in line\ Second line goes
block of words and heading
H2¶
Bold
Italic
Bold and Italic
or
Bold
Italic
Bold and Italic
And another item.
Some text that should be aligned with the above item.
This is page 1
Second Page
Reference-style:
![]()
Here's our logo.
Inline-style:
![]()
To print a string print("Codanics")'
print("Python")'
To write different language codes with colors according to their syntax
print("Python")
x+y=5
x=7
print("Python")
x+y=5
x=7
| Tables | Are | Cool |
|---|---|---|
| col 3 is | right-aligned | $1600 |
| col 2 is | centered | $12 |
| zebra stripes | are neat | $1 |
The outer pipes (|) are optional, and you don't need to make the raw Markdown line up prettily. You can also use inline Markdown.
| Markdown | Less | Pretty |
|---|---|---|
| Still | renders |
nicely |
| 1 | 2 | 3 |
Good reference is MAthJax for mathematical equations syntax Here are some examples to try out:
$$-b \pm \sqrt{b^2 - 4ac} \over 2a$$$$x = a_0 + \frac{1}{a_1 + \frac{1}{a_2 + \frac{1}{a_3 + a_4}}}$$$$\forall x \in X, \quad \exists y \leq \epsilon$$The beginning and ending dollar signs ($) are the delimiters for the TeX markup.
You can also $add_{math}$ and
$$ math^{blocks} $$But make sure you \$Escape \$your \$dollar signs \$you want to keep!
Headings \ Block of words\ Line breaks\ Combine two things\ Text Face\ Bullets \ Line/page breaks\ Links and Hyperlinks\ Figure links\ Adding code/code blocks\ Tables\ Mathematical Equations
# pip install pandas
# pip install numpy
import numpy as np
import pandas as pd
Creating a Series by passing a list of integer values
s = pd.Series([1, 3, 5, np.nan,7, 6, 8])
s
0 1.0 1 3.0 2 5.0 3 NaN 4 7.0 5 6.0 6 8.0 dtype: float64
Creating a DataFrame by passing a NumPy array, with a datetime index and labeled columns
dates = pd.date_range("20210101", periods=20)
dates
DatetimeIndex(['2021-01-01', '2021-01-02', '2021-01-03', '2021-01-04',
'2021-01-05', '2021-01-06', '2021-01-07', '2021-01-08',
'2021-01-09', '2021-01-10', '2021-01-11', '2021-01-12',
'2021-01-13', '2021-01-14', '2021-01-15', '2021-01-16',
'2021-01-17', '2021-01-18', '2021-01-19', '2021-01-20'],
dtype='datetime64[ns]', freq='D')
df = pd.DataFrame(np.random.randn(20, 4), index=dates, columns=list("ABCD"))
df
| A | B | C | D | |
|---|---|---|---|---|
| 2021-01-01 | 0.008654 | -0.126477 | 0.539719 | 1.105822 |
| 2021-01-02 | -1.227898 | 0.579727 | 0.175053 | 0.185574 |
| 2021-01-03 | -0.946407 | -1.118558 | -1.435824 | 0.554559 |
| 2021-01-04 | 0.657420 | -0.554898 | -0.512110 | -1.599138 |
| 2021-01-05 | 1.162640 | 0.515293 | 0.453209 | 1.454471 |
| 2021-01-06 | -0.052724 | -0.681884 | 1.182558 | -0.672382 |
| 2021-01-07 | 0.593081 | 0.043416 | -0.277710 | -0.070971 |
| 2021-01-08 | 0.033507 | 0.740535 | 1.709354 | 0.786476 |
| 2021-01-09 | -2.264824 | -0.300875 | -0.325400 | 0.534196 |
| 2021-01-10 | -0.193594 | -0.217825 | -0.056438 | -1.047139 |
| 2021-01-11 | 0.779426 | 0.599466 | -0.335718 | 0.547992 |
| 2021-01-12 | 1.324442 | 0.939933 | 0.274368 | 1.463821 |
| 2021-01-13 | 0.580095 | -0.370571 | -0.179992 | -1.142861 |
| 2021-01-14 | -0.144554 | 1.730484 | -0.749591 | -1.221470 |
| 2021-01-15 | 1.233195 | -0.101237 | 0.540558 | -0.999549 |
| 2021-01-16 | -1.813061 | -0.952302 | 1.751119 | 0.930238 |
| 2021-01-17 | -0.368305 | -1.509361 | -0.252288 | 0.875926 |
| 2021-01-18 | 0.544170 | -1.456745 | -2.187101 | 0.921900 |
| 2021-01-19 | -0.453111 | -0.645028 | 2.185281 | 0.041818 |
| 2021-01-20 | -0.560898 | -1.242427 | -0.574084 | -0.488877 |
Creating a DataFrame by passing a dict of objects that can be converted to series-like
df2 = pd.DataFrame(
{
"A": 1.0,
"B": pd.Timestamp("20220111"),
"C": pd.Series(1, index=list(range(4)), dtype="float32"),
"D": np.array([3] * 4, dtype="int32"),
"E": pd.Categorical(["girl", "woman", "girl", "woman"]),
"F": "females",
}
)
df2
df2.dtypes #The columns of the resulting DataFrame have different dtypes
A float64 B datetime64[ns] C float32 D int32 E category F object dtype: object
view the top and bottom rows of the frame
df.head(1)
| A | B | C | D | |
|---|---|---|---|---|
| 2021-01-01 | 0.008654 | -0.126477 | 0.539719 | 1.105822 |
df.tail(3) # number defines how many data rows
| A | B | C | D | |
|---|---|---|---|---|
| 2021-01-18 | 0.544170 | -1.456745 | -2.187101 | 0.921900 |
| 2021-01-19 | -0.453111 | -0.645028 | 2.185281 | 0.041818 |
| 2021-01-20 | -0.560898 | -1.242427 | -0.574084 | -0.488877 |
Display the index
df.index
DatetimeIndex(['2021-01-01', '2021-01-02', '2021-01-03', '2021-01-04',
'2021-01-05', '2021-01-06', '2021-01-07', '2021-01-08',
'2021-01-09', '2021-01-10', '2021-01-11', '2021-01-12',
'2021-01-13', '2021-01-14', '2021-01-15', '2021-01-16',
'2021-01-17', '2021-01-18', '2021-01-19', '2021-01-20'],
dtype='datetime64[ns]', freq='D')
DataFrame.to_numpy() gives a NumPy representation of the underlying data. When you call DataFrame.to_numpy(), pandas will find the NumPy dtype that can hold all of the dtypes in the DataFrame. This may end up being object, which requires casting every value to a Python object.
df.to_numpy() #does not include the index or column labels in the output.
array([[ 0.00865439, -0.12647739, 0.53971875, 1.10582239],
[-1.22789804, 0.57972699, 0.17505269, 0.18557446],
[-0.94640689, -1.1185584 , -1.43582362, 0.55455913],
[ 0.65742019, -0.55489803, -0.51210964, -1.59913763],
[ 1.1626399 , 0.51529269, 0.45320869, 1.45447129],
[-0.05272386, -0.68188364, 1.18255781, -0.67238172],
[ 0.59308096, 0.04341616, -0.27770973, -0.07097116],
[ 0.03350683, 0.74053457, 1.70935432, 0.786476 ],
[-2.26482372, -0.30087523, -0.32540002, 0.5341962 ],
[-0.19359381, -0.21782531, -0.05643755, -1.0471391 ],
[ 0.77942596, 0.59946566, -0.33571777, 0.54799218],
[ 1.32444179, 0.93993339, 0.27436779, 1.46382077],
[ 0.58009458, -0.37057057, -0.17999158, -1.14286147],
[-0.1445542 , 1.73048425, -0.74959064, -1.22146951],
[ 1.2331945 , -0.10123727, 0.54055832, -0.99954909],
[-1.81306107, -0.95230174, 1.75111913, 0.93023769],
[-0.36830459, -1.50936091, -0.2522881 , 0.87592645],
[ 0.54417043, -1.45674516, -2.18710128, 0.92190002],
[-0.45311116, -0.64502841, 2.18528117, 0.04181784],
[-0.56089783, -1.24242699, -0.57408372, -0.48887749]])
Note that this can be an expensive operation when your DataFrame has columns with different data types, which comes down to a fundamental difference between pandas and NumPy: NumPy arrays have one dtype for the entire array, while pandas DataFrames have one dtype per column.
df2.to_numpy()
array([[1.0, Timestamp('2022-01-11 00:00:00'), 1.0, 3, 'girl', 'females'],
[1.0, Timestamp('2022-01-11 00:00:00'), 1.0, 3, 'woman',
'females'],
[1.0, Timestamp('2022-01-11 00:00:00'), 1.0, 3, 'girl', 'females'],
[1.0, Timestamp('2022-01-11 00:00:00'), 1.0, 3, 'woman',
'females']], dtype=object)
describe() shows a quick statistic summary
df.describe()
| A | B | C | D | |
|---|---|---|---|---|
| count | 20.000000 | 20.000000 | 20.000000 | 20.000000 |
| mean | -0.055437 | -0.206467 | 0.096248 | 0.108020 |
| std | 0.976762 | 0.862346 | 1.057938 | 0.957663 |
| min | -2.264824 | -1.509361 | -2.187101 | -1.599138 |
| 25% | -0.480058 | -0.749488 | -0.379816 | -0.754174 |
| 50% | -0.022035 | -0.259350 | -0.118215 | 0.359885 |
| 75% | 0.609166 | 0.531401 | 0.539929 | 0.887420 |
| max | 1.324442 | 1.730484 | 2.185281 | 1.463821 |
Transposing data
df2.T
| 0 | 1 | 2 | 3 | |
|---|---|---|---|---|
| A | 1.0 | 1.0 | 1.0 | 1.0 |
| B | 2022-01-11 00:00:00 | 2022-01-11 00:00:00 | 2022-01-11 00:00:00 | 2022-01-11 00:00:00 |
| C | 1.0 | 1.0 | 1.0 | 1.0 |
| D | 3 | 3 | 3 | 3 |
| E | girl | woman | girl | woman |
| F | females | females | females | females |
Sorting by an axis , descending order
df.sort_index(axis=1, ascending=False)
| D | C | B | A | |
|---|---|---|---|---|
| 2021-01-01 | 1.105822 | 0.539719 | -0.126477 | 0.008654 |
| 2021-01-02 | 0.185574 | 0.175053 | 0.579727 | -1.227898 |
| 2021-01-03 | 0.554559 | -1.435824 | -1.118558 | -0.946407 |
| 2021-01-04 | -1.599138 | -0.512110 | -0.554898 | 0.657420 |
| 2021-01-05 | 1.454471 | 0.453209 | 0.515293 | 1.162640 |
| 2021-01-06 | -0.672382 | 1.182558 | -0.681884 | -0.052724 |
| 2021-01-07 | -0.070971 | -0.277710 | 0.043416 | 0.593081 |
| 2021-01-08 | 0.786476 | 1.709354 | 0.740535 | 0.033507 |
| 2021-01-09 | 0.534196 | -0.325400 | -0.300875 | -2.264824 |
| 2021-01-10 | -1.047139 | -0.056438 | -0.217825 | -0.193594 |
| 2021-01-11 | 0.547992 | -0.335718 | 0.599466 | 0.779426 |
| 2021-01-12 | 1.463821 | 0.274368 | 0.939933 | 1.324442 |
| 2021-01-13 | -1.142861 | -0.179992 | -0.370571 | 0.580095 |
| 2021-01-14 | -1.221470 | -0.749591 | 1.730484 | -0.144554 |
| 2021-01-15 | -0.999549 | 0.540558 | -0.101237 | 1.233195 |
| 2021-01-16 | 0.930238 | 1.751119 | -0.952302 | -1.813061 |
| 2021-01-17 | 0.875926 | -0.252288 | -1.509361 | -0.368305 |
| 2021-01-18 | 0.921900 | -2.187101 | -1.456745 | 0.544170 |
| 2021-01-19 | 0.041818 | 2.185281 | -0.645028 | -0.453111 |
| 2021-01-20 | -0.488877 | -0.574084 | -1.242427 | -0.560898 |
Sorting by an values , ascending by default
df.sort_values(by="B")
| A | B | C | D | |
|---|---|---|---|---|
| 2021-01-17 | -0.368305 | -1.509361 | -0.252288 | 0.875926 |
| 2021-01-18 | 0.544170 | -1.456745 | -2.187101 | 0.921900 |
| 2021-01-20 | -0.560898 | -1.242427 | -0.574084 | -0.488877 |
| 2021-01-03 | -0.946407 | -1.118558 | -1.435824 | 0.554559 |
| 2021-01-16 | -1.813061 | -0.952302 | 1.751119 | 0.930238 |
| 2021-01-06 | -0.052724 | -0.681884 | 1.182558 | -0.672382 |
| 2021-01-19 | -0.453111 | -0.645028 | 2.185281 | 0.041818 |
| 2021-01-04 | 0.657420 | -0.554898 | -0.512110 | -1.599138 |
| 2021-01-13 | 0.580095 | -0.370571 | -0.179992 | -1.142861 |
| 2021-01-09 | -2.264824 | -0.300875 | -0.325400 | 0.534196 |
| 2021-01-10 | -0.193594 | -0.217825 | -0.056438 | -1.047139 |
| 2021-01-01 | 0.008654 | -0.126477 | 0.539719 | 1.105822 |
| 2021-01-15 | 1.233195 | -0.101237 | 0.540558 | -0.999549 |
| 2021-01-07 | 0.593081 | 0.043416 | -0.277710 | -0.070971 |
| 2021-01-05 | 1.162640 | 0.515293 | 0.453209 | 1.454471 |
| 2021-01-02 | -1.227898 | 0.579727 | 0.175053 | 0.185574 |
| 2021-01-11 | 0.779426 | 0.599466 | -0.335718 | 0.547992 |
| 2021-01-08 | 0.033507 | 0.740535 | 1.709354 | 0.786476 |
| 2021-01-12 | 1.324442 | 0.939933 | 0.274368 | 1.463821 |
| 2021-01-14 | -0.144554 | 1.730484 | -0.749591 | -1.221470 |
Selecting a single column, which yields a Series, equivalent to df.A
df["A"]
2021-01-01 0.008654 2021-01-02 -1.227898 2021-01-03 -0.946407 2021-01-04 0.657420 2021-01-05 1.162640 2021-01-06 -0.052724 2021-01-07 0.593081 2021-01-08 0.033507 2021-01-09 -2.264824 2021-01-10 -0.193594 2021-01-11 0.779426 2021-01-12 1.324442 2021-01-13 0.580095 2021-01-14 -0.144554 2021-01-15 1.233195 2021-01-16 -1.813061 2021-01-17 -0.368305 2021-01-18 0.544170 2021-01-19 -0.453111 2021-01-20 -0.560898 Freq: D, Name: A, dtype: float64
Selecting via [], which slices the rows
#row wise selection
df[0:3]
| A | B | C | D | |
|---|---|---|---|---|
| 2021-01-01 | 0.008654 | -0.126477 | 0.539719 | 1.105822 |
| 2021-01-02 | -1.227898 | 0.579727 | 0.175053 | 0.185574 |
| 2021-01-03 | -0.946407 | -1.118558 | -1.435824 | 0.554559 |
df["20210102":"20210104"]
| A | B | C | D | |
|---|---|---|---|---|
| 2021-01-02 | -1.227898 | 0.579727 | 0.175053 | 0.185574 |
| 2021-01-03 | -0.946407 | -1.118558 | -1.435824 | 0.554559 |
| 2021-01-04 | 0.657420 | -0.554898 | -0.512110 | -1.599138 |
For getting a cross section using a label
df.loc[dates[0]]
A 0.008654 B -0.126477 C 0.539719 D 1.105822 Name: 2021-01-01 00:00:00, dtype: float64
Selecting on a multi-axis by label
df.loc[:, ["A", "B"]]
| A | B | |
|---|---|---|
| 2021-01-01 | 0.008654 | -0.126477 |
| 2021-01-02 | -1.227898 | 0.579727 |
| 2021-01-03 | -0.946407 | -1.118558 |
| 2021-01-04 | 0.657420 | -0.554898 |
| 2021-01-05 | 1.162640 | 0.515293 |
| 2021-01-06 | -0.052724 | -0.681884 |
| 2021-01-07 | 0.593081 | 0.043416 |
| 2021-01-08 | 0.033507 | 0.740535 |
| 2021-01-09 | -2.264824 | -0.300875 |
| 2021-01-10 | -0.193594 | -0.217825 |
| 2021-01-11 | 0.779426 | 0.599466 |
| 2021-01-12 | 1.324442 | 0.939933 |
| 2021-01-13 | 0.580095 | -0.370571 |
| 2021-01-14 | -0.144554 | 1.730484 |
| 2021-01-15 | 1.233195 | -0.101237 |
| 2021-01-16 | -1.813061 | -0.952302 |
| 2021-01-17 | -0.368305 | -1.509361 |
| 2021-01-18 | 0.544170 | -1.456745 |
| 2021-01-19 | -0.453111 | -0.645028 |
| 2021-01-20 | -0.560898 | -1.242427 |
Showing label slicing, both endpoints are included
df.loc["20130102":"20130104", ["A", "B"]]
| A | B |
|---|
Reduction in the dimensions of the returned object
df.loc["20210102", ["A", "B"]]
A -1.227898 B 0.579727 Name: 2021-01-02 00:00:00, dtype: float64
For getting a scalar value:
df.loc[dates[0], "A"]
0.008654390052754285
For getting fast access to a scalar (equivalent to the prior method)
df.at[dates[0], "A"]
0.008654390052754285
Select via the position of the passed integers, By integer slices, acting similar to NumPy/Python
df.iloc[3:5, 0:2]
| A | B | |
|---|---|---|
| 2021-01-04 | 0.65742 | -0.554898 |
| 2021-01-05 | 1.16264 | 0.515293 |
df.iloc[3]
A 0.657420 B -0.554898 C -0.512110 D -1.599138 Name: 2021-01-04 00:00:00, dtype: float64
df.iloc[[1, 2, 4], [0, 2]]
| A | C | |
|---|---|---|
| 2021-01-02 | -1.227898 | 0.175053 |
| 2021-01-03 | -0.946407 | -1.435824 |
| 2021-01-05 | 1.162640 | 0.453209 |
For slicing rows explicitly:
df.iloc[1:3, :] # 1st row at 0 place and 3rd row excluded
| A | B | C | D | |
|---|---|---|---|---|
| 2021-01-02 | -1.227898 | 0.579727 | 0.175053 | 0.185574 |
| 2021-01-03 | -0.946407 | -1.118558 | -1.435824 | 0.554559 |
For slicing columns explicitly:
df.iloc[:, 1:3] # 1st column at 0 place and 3rd column excluded
| B | C | |
|---|---|---|
| 2021-01-01 | -0.126477 | 0.539719 |
| 2021-01-02 | 0.579727 | 0.175053 |
| 2021-01-03 | -1.118558 | -1.435824 |
| 2021-01-04 | -0.554898 | -0.512110 |
| 2021-01-05 | 0.515293 | 0.453209 |
| 2021-01-06 | -0.681884 | 1.182558 |
| 2021-01-07 | 0.043416 | -0.277710 |
| 2021-01-08 | 0.740535 | 1.709354 |
| 2021-01-09 | -0.300875 | -0.325400 |
| 2021-01-10 | -0.217825 | -0.056438 |
| 2021-01-11 | 0.599466 | -0.335718 |
| 2021-01-12 | 0.939933 | 0.274368 |
| 2021-01-13 | -0.370571 | -0.179992 |
| 2021-01-14 | 1.730484 | -0.749591 |
| 2021-01-15 | -0.101237 | 0.540558 |
| 2021-01-16 | -0.952302 | 1.751119 |
| 2021-01-17 | -1.509361 | -0.252288 |
| 2021-01-18 | -1.456745 | -2.187101 |
| 2021-01-19 | -0.645028 | 2.185281 |
| 2021-01-20 | -1.242427 | -0.574084 |
For getting a value explicitly:
df.iloc[1, 1]
0.5797269868237717
For getting fast access to a scalar (equivalent to the prior method)
df.iat[1, 1]
0.5797269868237717
Using a single column�s values to select data
df[df["A"] > 0]
| A | B | C | D | |
|---|---|---|---|---|
| 2021-01-01 | 0.008654 | -0.126477 | 0.539719 | 1.105822 |
| 2021-01-04 | 0.657420 | -0.554898 | -0.512110 | -1.599138 |
| 2021-01-05 | 1.162640 | 0.515293 | 0.453209 | 1.454471 |
| 2021-01-07 | 0.593081 | 0.043416 | -0.277710 | -0.070971 |
| 2021-01-08 | 0.033507 | 0.740535 | 1.709354 | 0.786476 |
| 2021-01-11 | 0.779426 | 0.599466 | -0.335718 | 0.547992 |
| 2021-01-12 | 1.324442 | 0.939933 | 0.274368 | 1.463821 |
| 2021-01-13 | 0.580095 | -0.370571 | -0.179992 | -1.142861 |
| 2021-01-15 | 1.233195 | -0.101237 | 0.540558 | -0.999549 |
| 2021-01-18 | 0.544170 | -1.456745 | -2.187101 | 0.921900 |
Using a multiple column�s values to select data
df[(df["A"] > 0) & (df["B"] > 0)] # () parenthesis to be used for & AND and | OR operator
| A | B | C | D | |
|---|---|---|---|---|
| 2021-01-05 | 1.162640 | 0.515293 | 0.453209 | 1.454471 |
| 2021-01-07 | 0.593081 | 0.043416 | -0.277710 | -0.070971 |
| 2021-01-08 | 0.033507 | 0.740535 | 1.709354 | 0.786476 |
| 2021-01-11 | 0.779426 | 0.599466 | -0.335718 | 0.547992 |
| 2021-01-12 | 1.324442 | 0.939933 | 0.274368 | 1.463821 |
Selecting values from a DataFrame where a boolean condition is met
df[df > 0]
| A | B | C | D | |
|---|---|---|---|---|
| 2021-01-01 | 0.008654 | NaN | 0.539719 | 1.105822 |
| 2021-01-02 | NaN | 0.579727 | 0.175053 | 0.185574 |
| 2021-01-03 | NaN | NaN | NaN | 0.554559 |
| 2021-01-04 | 0.657420 | NaN | NaN | NaN |
| 2021-01-05 | 1.162640 | 0.515293 | 0.453209 | 1.454471 |
| 2021-01-06 | NaN | NaN | 1.182558 | NaN |
| 2021-01-07 | 0.593081 | 0.043416 | NaN | NaN |
| 2021-01-08 | 0.033507 | 0.740535 | 1.709354 | 0.786476 |
| 2021-01-09 | NaN | NaN | NaN | 0.534196 |
| 2021-01-10 | NaN | NaN | NaN | NaN |
| 2021-01-11 | 0.779426 | 0.599466 | NaN | 0.547992 |
| 2021-01-12 | 1.324442 | 0.939933 | 0.274368 | 1.463821 |
| 2021-01-13 | 0.580095 | NaN | NaN | NaN |
| 2021-01-14 | NaN | 1.730484 | NaN | NaN |
| 2021-01-15 | 1.233195 | NaN | 0.540558 | NaN |
| 2021-01-16 | NaN | NaN | 1.751119 | 0.930238 |
| 2021-01-17 | NaN | NaN | NaN | 0.875926 |
| 2021-01-18 | 0.544170 | NaN | NaN | 0.921900 |
| 2021-01-19 | NaN | NaN | 2.185281 | 0.041818 |
| 2021-01-20 | NaN | NaN | NaN | NaN |
Using the isin() method for filtering
df2 = df.copy()
df2["g"] = [ "one", "two", "three", "four", "three","one", "two", "three", "four", "three", "one", "two", "three", "four", "three", "one", "two", "three", "four", "three"]
take row wise mean of A,B,C,D
df2["mean"] = df[["A","B","C","D"]].mean(1)
np.mean([-1.087499,1.664167,-2.160685,-0.161921])
-0.4364845
df2=df2[["A","B","C","D","mean"]] # To udate
#import libraries
import seaborn as sns
import pandas as pd
#load dataset
phool= sns.load_dataset("iris")
phool.describe()
| sepal_length | sepal_width | petal_length | petal_width | |
|---|---|---|---|---|
| count | 150.000000 | 150.000000 | 150.000000 | 150.000000 |
| mean | 5.843333 | 3.057333 | 3.758000 | 1.199333 |
| std | 0.828066 | 0.435866 | 1.765298 | 0.762238 |
| min | 4.300000 | 2.000000 | 1.000000 | 0.100000 |
| 25% | 5.100000 | 2.800000 | 1.600000 | 0.300000 |
| 50% | 5.800000 | 3.000000 | 4.350000 | 1.300000 |
| 75% | 6.400000 | 3.300000 | 5.100000 | 1.800000 |
| max | 7.900000 | 4.400000 | 6.900000 | 2.500000 |
phool.head()
| sepal_length | sepal_width | petal_length | petal_width | species | |
|---|---|---|---|---|---|
| 0 | 5.1 | 3.5 | 1.4 | 0.2 | setosa |
| 1 | 4.9 | 3.0 | 1.4 | 0.2 | setosa |
| 2 | 4.7 | 3.2 | 1.3 | 0.2 | setosa |
| 3 | 4.6 | 3.1 | 1.5 | 0.2 | setosa |
| 4 | 5.0 | 3.6 | 1.4 | 0.2 | setosa |
Mode:Computed column-wise:
phool.mode()
| sepal_length | sepal_width | petal_length | petal_width | species | |
|---|---|---|---|---|---|
| 0 | 5.0 | 3.0 | 1.4 | 0.2 | setosa |
| 1 | NaN | NaN | 1.5 | NaN | versicolor |
| 2 | NaN | NaN | NaN | NaN | virginica |
Mode:Computed row-wise:
phool.astype(str).mode(axis=1)
| 0 | 1 | 2 | 3 | 4 | |
|---|---|---|---|---|---|
| 0 | 0.2 | 1.4 | 3.5 | 5.1 | setosa |
| 1 | 0.2 | 1.4 | 3.0 | 4.9 | setosa |
| 2 | 0.2 | 1.3 | 3.2 | 4.7 | setosa |
| 3 | 0.2 | 1.5 | 3.1 | 4.6 | setosa |
| 4 | 0.2 | 1.4 | 3.6 | 5.0 | setosa |
| ... | ... | ... | ... | ... | ... |
| 145 | 2.3 | 3.0 | 5.2 | 6.7 | virginica |
| 146 | 1.9 | 2.5 | 5.0 | 6.3 | virginica |
| 147 | 2.0 | 3.0 | 5.2 | 6.5 | virginica |
| 148 | 2.3 | 3.4 | 5.4 | 6.2 | virginica |
| 149 | 1.8 | 3.0 | 5.1 | 5.9 | virginica |
150 rows � 5 columns
Median:Computed column-wise:
phool.median()
C:\Users\Komal Khan\AppData\Local\Temp\ipykernel_20792\4022828141.py:1: FutureWarning: Dropping of nuisance columns in DataFrame reductions (with 'numeric_only=None') is deprecated; in a future version this will raise TypeError. Select only valid columns before calling the reduction. phool.median()
sepal_length 5.80 sepal_width 3.00 petal_length 4.35 petal_width 1.30 dtype: float64
Median:Computed row-wise:
phool.median(axis=1)
C:\Users\Komal Khan\AppData\Local\Temp\ipykernel_20792\3347908373.py:1: FutureWarning: Dropping of nuisance columns in DataFrame reductions (with 'numeric_only=None') is deprecated; in a future version this will raise TypeError. Select only valid columns before calling the reduction. phool.median(axis=1)
0 2.45
1 2.20
2 2.25
3 2.30
4 2.50
...
145 4.10
146 3.75
147 4.10
148 4.40
149 4.05
Length: 150, dtype: float64
#select kernel and then import important libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
#Loading Titanic dataset
kashti= sns.load_dataset("titanic")
#Converting the dataset to csv file
kashti.to_csv('kashti.csv')
kashti.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 891 entries, 0 to 890 Data columns (total 15 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 survived 891 non-null int64 1 pclass 891 non-null int64 2 sex 891 non-null object 3 age 714 non-null float64 4 sibsp 891 non-null int64 5 parch 891 non-null int64 6 fare 891 non-null float64 7 embarked 889 non-null object 8 class 891 non-null category 9 who 891 non-null object 10 adult_male 891 non-null bool 11 deck 203 non-null category 12 embark_town 889 non-null object 13 alive 891 non-null object 14 alone 891 non-null bool dtypes: bool(2), category(2), float64(2), int64(4), object(5) memory usage: 80.7+ KB
# Saving in another variable to make some cleaning and later compare them
ks=kashti
To know how the data looks like and what length do we have:
ks.head() # First five rows
| survived | pclass | sex | age | sibsp | parch | fare | embarked | class | who | adult_male | deck | embark_town | alive | alone | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 3 | male | 22.0 | 1 | 0 | 7.2500 | S | Third | man | True | NaN | Southampton | no | False |
| 1 | 1 | 1 | female | 38.0 | 1 | 0 | 71.2833 | C | First | woman | False | C | Cherbourg | yes | False |
| 2 | 1 | 3 | female | 26.0 | 0 | 0 | 7.9250 | S | Third | woman | False | NaN | Southampton | yes | True |
| 3 | 1 | 1 | female | 35.0 | 1 | 0 | 53.1000 | S | First | woman | False | C | Southampton | yes | False |
| 4 | 0 | 3 | male | 35.0 | 0 | 0 | 8.0500 | S | Third | man | True | NaN | Southampton | no | True |
# To know the number of rows and columns of the data
ks.shape
(891, 15)
ks.tail() # last five rows
| survived | pclass | sex | age | sibsp | parch | fare | embarked | class | who | adult_male | deck | embark_town | alive | alone | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 886 | 0 | 2 | male | 27.0 | 0 | 0 | 13.00 | S | Second | man | True | NaN | Southampton | no | True |
| 887 | 1 | 1 | female | 19.0 | 0 | 0 | 30.00 | S | First | woman | False | B | Southampton | yes | True |
| 888 | 0 | 3 | female | NaN | 1 | 2 | 23.45 | S | Third | woman | False | NaN | Southampton | no | False |
| 889 | 1 | 1 | male | 26.0 | 0 | 0 | 30.00 | C | First | man | True | C | Cherbourg | yes | True |
| 890 | 0 | 3 | male | 32.0 | 0 | 0 | 7.75 | Q | Third | man | True | NaN | Queenstown | no | True |
ks.describe() # give info about the numeric variables
| survived | pclass | age | sibsp | parch | fare | |
|---|---|---|---|---|---|---|
| count | 891.000000 | 891.000000 | 714.000000 | 891.000000 | 891.000000 | 891.000000 |
| mean | 0.383838 | 2.308642 | 29.699118 | 0.523008 | 0.381594 | 32.204208 |
| std | 0.486592 | 0.836071 | 14.526497 | 1.102743 | 0.806057 | 49.693429 |
| min | 0.000000 | 1.000000 | 0.420000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 0.000000 | 2.000000 | 20.125000 | 0.000000 | 0.000000 | 7.910400 |
| 50% | 0.000000 | 3.000000 | 28.000000 | 0.000000 | 0.000000 | 14.454200 |
| 75% | 1.000000 | 3.000000 | 38.000000 | 1.000000 | 0.000000 | 31.000000 |
| max | 1.000000 | 3.000000 | 80.000000 | 8.000000 | 6.000000 | 512.329200 |
# Unique values
ks.nunique() # numerical values and categorical values
survived 2 pclass 3 sex 2 age 88 sibsp 7 parch 7 fare 248 embarked 3 class 3 who 3 adult_male 2 deck 7 embark_town 3 alive 2 alone 2 dtype: int64
# Column Names
ks.columns
Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
'embarked', 'class', 'who', 'adult_male', 'deck', 'embark_town',
'alive', 'alone'],
dtype='object')
ks['who'].unique() # No need to scroll down to the dataset
array(['man', 'woman', 'child'], dtype=object)
Assignment To get unique objects of mutiple columns, two ways are mentioned below: 1- By Concatenation of columns
pd.concat((ks['who'],(ks['class']),(ks['adult_male']))).unique()
array(['man', 'woman', 'child', 'Third', 'First', 'Second', True, False],
dtype=object)
1- By Appending columns
(ks['who'].append((ks['adult_male'],(ks['class'])))).unique()
array(['man', 'woman', 'child', True, False, 'Third', 'First', 'Second'],
dtype=object)
ks.isnull() # True means null values, False means no
| survived | pclass | sex | age | sibsp | parch | fare | embarked | class | who | adult_male | deck | embark_town | alive | alone | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | False | False | False | False | False | False | False | False | False | False | False | True | False | False | False |
| 1 | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False |
| 2 | False | False | False | False | False | False | False | False | False | False | False | True | False | False | False |
| 3 | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False |
| 4 | False | False | False | False | False | False | False | False | False | False | False | True | False | False | False |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 886 | False | False | False | False | False | False | False | False | False | False | False | True | False | False | False |
| 887 | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False |
| 888 | False | False | False | True | False | False | False | False | False | False | False | True | False | False | False |
| 889 | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False |
| 890 | False | False | False | False | False | False | False | False | False | False | False | True | False | False | False |
891 rows � 15 columns
ks.isnull().sum() # summation of null values in each column
survived 0 pclass 0 sex 0 age 177 sibsp 0 parch 0 fare 0 embarked 2 class 0 who 0 adult_male 0 deck 688 embark_town 2 alive 0 alone 0 dtype: int64
# Data cleaning, Removing column of more missing values
ks_clean= ks.drop(['deck'],axis=1)
ks_clean.head()
| survived | pclass | sex | age | sibsp | parch | fare | embarked | class | who | adult_male | embark_town | alive | alone | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 3 | male | 22.0 | 1 | 0 | 7.2500 | S | Third | man | True | Southampton | no | False |
| 1 | 1 | 1 | female | 38.0 | 1 | 0 | 71.2833 | C | First | woman | False | Cherbourg | yes | False |
| 2 | 1 | 3 | female | 26.0 | 0 | 0 | 7.9250 | S | Third | woman | False | Southampton | yes | True |
| 3 | 1 | 1 | female | 35.0 | 1 | 0 | 53.1000 | S | First | woman | False | Southampton | yes | False |
| 4 | 0 | 3 | male | 35.0 | 0 | 0 | 8.0500 | S | Third | man | True | Southampton | no | True |
ks_clean.isnull().sum()
survived 0 pclass 0 sex 0 age 177 sibsp 0 parch 0 fare 0 embarked 2 class 0 who 0 adult_male 0 embark_town 2 alive 0 alone 0 dtype: int64
ks_clean.shape
(891, 14)
ks_clean=ks_clean.dropna() # cleaning missing values and updating
ks_clean.shape
(712, 14)
ks_clean.isnull().sum()
survived 0 pclass 0 sex 0 age 0 sibsp 0 parch 0 fare 0 embarked 0 class 0 who 0 adult_male 0 embark_town 0 alive 0 alone 0 dtype: int64
ks.shape # comparing the shapes before and after removing the values
(891, 15)
ks_clean['sex'].value_counts() # to know how many values
male 453 female 259 Name: sex, dtype: int64
ks.describe()
| survived | pclass | age | sibsp | parch | fare | |
|---|---|---|---|---|---|---|
| count | 891.000000 | 891.000000 | 714.000000 | 891.000000 | 891.000000 | 891.000000 |
| mean | 0.383838 | 2.308642 | 29.699118 | 0.523008 | 0.381594 | 32.204208 |
| std | 0.486592 | 0.836071 | 14.526497 | 1.102743 | 0.806057 | 49.693429 |
| min | 0.000000 | 1.000000 | 0.420000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 0.000000 | 2.000000 | 20.125000 | 0.000000 | 0.000000 | 7.910400 |
| 50% | 0.000000 | 3.000000 | 28.000000 | 0.000000 | 0.000000 | 14.454200 |
| 75% | 1.000000 | 3.000000 | 38.000000 | 1.000000 | 0.000000 | 31.000000 |
| max | 1.000000 | 3.000000 | 80.000000 | 8.000000 | 6.000000 | 512.329200 |
ks_clean.describe()
| survived | pclass | age | sibsp | parch | fare | |
|---|---|---|---|---|---|---|
| count | 712.000000 | 712.000000 | 712.000000 | 712.000000 | 712.000000 | 712.000000 |
| mean | 0.404494 | 2.240169 | 29.642093 | 0.514045 | 0.432584 | 34.567251 |
| std | 0.491139 | 0.836854 | 14.492933 | 0.930692 | 0.854181 | 52.938648 |
| min | 0.000000 | 1.000000 | 0.420000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 0.000000 | 1.000000 | 20.000000 | 0.000000 | 0.000000 | 8.050000 |
| 50% | 0.000000 | 2.000000 | 28.000000 | 0.000000 | 0.000000 | 15.645850 |
| 75% | 1.000000 | 3.000000 | 38.000000 | 1.000000 | 1.000000 | 33.000000 |
| max | 1.000000 | 3.000000 | 80.000000 | 5.000000 | 6.000000 | 512.329200 |
ks_clean.columns
Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
'embarked', 'class', 'who', 'adult_male', 'embark_town', 'alive',
'alone'],
dtype='object')
sns.boxplot(x='sex',y='age', data=ks_clean) # Out liers can be seen above 70 age , we'll remove it.
<AxesSubplot:xlabel='sex', ylabel='age'>
sns.distplot(ks_clean['age'])
C:\Users\Komal Khan\AppData\Local\Programs\Python\Python310\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning)
<AxesSubplot:xlabel='age', ylabel='Density'>
# Removing outliers
ks_clean['age'].mean()
29.64209269662921
ks_clean=ks_clean[ks_clean['age']<60]
ks_clean.head()
| survived | pclass | sex | age | sibsp | parch | fare | embarked | class | who | adult_male | embark_town | alive | alone | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 3 | male | 22.0 | 1 | 0 | 7.2500 | S | Third | man | True | Southampton | no | False |
| 1 | 1 | 1 | female | 38.0 | 1 | 0 | 71.2833 | C | First | woman | False | Cherbourg | yes | False |
| 2 | 1 | 3 | female | 26.0 | 0 | 0 | 7.9250 | S | Third | woman | False | Southampton | yes | True |
| 3 | 1 | 1 | female | 35.0 | 1 | 0 | 53.1000 | S | First | woman | False | Southampton | yes | False |
| 4 | 0 | 3 | male | 35.0 | 0 | 0 | 8.0500 | S | Third | man | True | Southampton | no | True |
sns.boxplot(y='age', data=ks_clean)
<AxesSubplot:ylabel='age'>
ks_clean.boxplot() # in fare column we've different classes of fare, therefore outliers can be seen we can simply a bit
<AxesSubplot:>
ks_clean=ks_clean[ks_clean['fare']<300]
ks_clean.boxplot()
<AxesSubplot:>
sns.distplot(ks_clean['fare']) # to see dispersions and normality check
C:\Users\Komal Khan\AppData\Local\Programs\Python\Python310\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning)
<AxesSubplot:xlabel='fare', ylabel='Density'>
ks_clean.hist() # options to save image and open in seperate window
array([[<AxesSubplot:title={'center':'survived'}>,
<AxesSubplot:title={'center':'pclass'}>],
[<AxesSubplot:title={'center':'age'}>,
<AxesSubplot:title={'center':'sibsp'}>],
[<AxesSubplot:title={'center':'parch'}>,
<AxesSubplot:title={'center':'fare'}>]], dtype=object)
Graphical view of values
pd.value_counts(ks_clean['survived']).plot.bar()
<AxesSubplot:>
ks_clean.groupby(['sex','class']).mean()
| survived | pclass | age | sibsp | parch | fare | adult_male | alone | ||
|---|---|---|---|---|---|---|---|---|---|
| sex | class | ||||||||
| female | First | 0.962500 | 1.0 | 33.550000 | 0.550000 | 0.525000 | 104.373699 | 0.000000 | 0.362500 |
| Second | 0.918919 | 2.0 | 28.722973 | 0.500000 | 0.621622 | 21.951070 | 0.000000 | 0.405405 | |
| Third | 0.455446 | 3.0 | 21.341584 | 0.831683 | 0.960396 | 15.937625 | 0.000000 | 0.366337 | |
| male | First | 0.423529 | 1.0 | 37.440235 | 0.411765 | 0.305882 | 63.216519 | 0.964706 | 0.505882 |
| Second | 0.147368 | 2.0 | 29.319263 | 0.378947 | 0.242105 | 21.260000 | 0.905263 | 0.631579 | |
| Third | 0.152610 | 3.0 | 25.847068 | 0.497992 | 0.261044 | 12.239556 | 0.887550 | 0.734940 |
ks.groupby(['sex','class']).mean()
| survived | pclass | age | sibsp | parch | fare | adult_male | alone | ||
|---|---|---|---|---|---|---|---|---|---|
| sex | class | ||||||||
| female | First | 0.968085 | 1.0 | 34.611765 | 0.553191 | 0.457447 | 106.125798 | 0.000000 | 0.361702 |
| Second | 0.921053 | 2.0 | 28.722973 | 0.486842 | 0.605263 | 21.970121 | 0.000000 | 0.421053 | |
| Third | 0.500000 | 3.0 | 21.750000 | 0.895833 | 0.798611 | 16.118810 | 0.000000 | 0.416667 | |
| male | First | 0.368852 | 1.0 | 41.281386 | 0.311475 | 0.278689 | 67.226127 | 0.975410 | 0.614754 |
| Second | 0.157407 | 2.0 | 30.740707 | 0.342593 | 0.222222 | 19.741782 | 0.916667 | 0.666667 | |
| Third | 0.135447 | 3.0 | 26.507589 | 0.498559 | 0.224784 | 12.661633 | 0.919308 | 0.760807 |
Relationship
corr_ks_clean=ks_clean.corr()
sns.heatmap(corr_ks_clean)
<AxesSubplot:>
sns.heatmap(corr_ks_clean, annot=True)
<AxesSubplot:>
relation of numeric variables , the more numeric variables we have the more relations can be drawn
sns.relplot(x='age',y='fare', hue='sex', data=ks_clean)
<seaborn.axisgrid.FacetGrid at 0x1e696a2a9b0>
Categorical plotting:
sns.catplot(x='sex',y='fare',hue='sex',data=ks_clean, kind='bar')
<seaborn.axisgrid.FacetGrid at 0x1e696a60cd0>
sns.catplot(x='sex',y='age',hue='sex',data=ks_clean, kind='box')
<seaborn.axisgrid.FacetGrid at 0x1e696a43730>
Log Transformation
ks_clean['fare_log']=np.log(ks_clean['fare'])
ks_clean.head()
C:\Users\Komal Khan\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\core\arraylike.py:364: RuntimeWarning: divide by zero encountered in log result = getattr(ufunc, method)(*inputs, **kwargs)
| survived | pclass | sex | age | sibsp | parch | fare | embarked | class | who | adult_male | embark_town | alive | alone | fare_log | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 3 | male | 22.0 | 1 | 0 | 7.2500 | S | Third | man | True | Southampton | no | False | 1.981001 |
| 1 | 1 | 1 | female | 38.0 | 1 | 0 | 71.2833 | C | First | woman | False | Cherbourg | yes | False | 4.266662 |
| 2 | 1 | 3 | female | 26.0 | 0 | 0 | 7.9250 | S | Third | woman | False | Southampton | yes | True | 2.070022 |
| 3 | 1 | 1 | female | 35.0 | 1 | 0 | 53.1000 | S | First | woman | False | Southampton | yes | False | 3.972177 |
| 4 | 0 | 3 | male | 35.0 | 0 | 0 | 8.0500 | S | Third | man | True | Southampton | no | True | 2.085672 |
Box Plot of fare and fare log (By taking fare log , outliers can be removed and data can be normalised)
sns.catplot(x='sex',y='fare',hue='sex',data=ks_clean, kind='box')
<seaborn.axisgrid.FacetGrid at 0x1e697bf8ee0>
sns.catplot(x='sex',y='fare_log',hue='sex',data=ks_clean, kind='box')
<seaborn.axisgrid.FacetGrid at 0x1e697ddff70>